import os
import polars as pl
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio
import matplotlib.pyplot as plt
import seaborn as sns
pio.renderers.default = 'notebook'Job Skills Analysis
df = pd.read_csv('lightcast_job_postings.csv', low_memory=False)for col in ['COMMON_SKILLS_NAME', 'SPECIALIZED_SKILLS_NAME', 'SOFTWARE_SKILLS_NAME']:
if col in df.columns:
df[col] = df[col].replace(r'^\s*$', pd.NA, regex=True).dropna()skills_series = df['SKILLS_NAME'].dropna()
all_skills = skills_series.str.split(',').explode().str.strip()
skill_counts = all_skills.value_counts().reset_index()
skill_counts.columns = ['Skill', 'Count']
top_n = 20
fig = px.bar(skill_counts.head(top_n),
x='Skill',
y='Count',
title=f"Top {top_n} Most Common Job Skills",
labels={'Count': 'Frequency', 'Skill': 'Skill'},
text='Count',
color='Skill')
fig.update_traces(textposition='outside')
fig.update_layout(xaxis_tickangle=-45)
fig.show()
top_20_job_skills = os.path.join('iframe_figures', 'top_20_job_skills.png')
fig.write_image("iframe_figures/top_skills.png")def process_skills(df, column_name, skill_type):
skill_counts = (
df[column_name].dropna()
.str.split(',')
.explode()
.str.strip()
.value_counts()
.reset_index()
)
skill_counts.columns = ['Skill', 'Count']
skill_counts['Type'] = skill_type
skill_counts['Count'] = skill_counts['Count'].astype(int)
return skill_counts
# Process each skill type (pass the skill_type string!)
common_skills = process_skills(df, 'COMMON_SKILLS_NAME', 'Common')
specialized_skills = process_skills(df, 'SPECIALIZED_SKILLS_NAME', 'Specialized')
software_skills = process_skills(df, 'SOFTWARE_SKILLS_NAME', 'Software')
# Combine all skills
combined_skills = pd.concat([common_skills, specialized_skills, software_skills])
def clean_skill_text(skill):
return skill.strip().strip('"').strip("'").strip('[]').strip()
# Apply this to all skill names
combined_skills['Skill'] = combined_skills['Skill'].apply(clean_skill_text)
# Get top N skills from each group
top_n = 10
top_skills = (
combined_skills.groupby('Type', group_keys=False)
.apply(lambda g: g.nlargest(top_n, 'Count'))
)
# Plot grouped bar chart
fig = px.bar(top_skills,
x='Skill',
y='Count',
color='Type',
barmode='group',
title=f"Top Skills by Type",
labels={'Count': 'Frequency'},
text='Count')
fig.update_layout(
xaxis_tickangle=-35,
xaxis_title='Skill',
yaxis_title='Frequency',
title_x=0.5,
uniformtext_minsize=8,
uniformtext_mode='hide',
legend_title_text='Skill Type',
)
fig.update_traces(textposition='outside')
fig.show()
fig.write_image("iframe_figures/top_skills_by_type.png")/tmp/ipykernel_8534/3933253511.py:32: DeprecationWarning:
DataFrameGroupBy.apply operated on the grouping columns. This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning.
Team Skills Analysis
skills_data = {
"Name": ["Andrey", "Jason", "Moiz", "Jitvan", "Prabu"],
"Python (Programming Language)": [5, 3, 4, 2, 4],
"Sql (Programming Language)": [4, 2, 5, 3, 2],
"Power BI": [3, 1, 4, 2, 3],
"Computer Science": [2, 2, 3, 1, 5],
}
df_skills = pd.DataFrame(skills_data).set_index("Name")
df_skills.columns = df_skills.columns.str.lower().str.replace(' ', '_')plt.figure(figsize=(8, 6))
sns.heatmap(df_skills, annot=True, cmap="coolwarm", linewidths=0.5)
plt.title("Team Skill Levels Heatmap")
plt.show()def extract_top_skills(df, columns, top_n=10):
combined = pd.Series(dtype="str")
for col in columns:
if col in df.columns:
combined = pd.concat([combined, df[col].dropna().str.split(',').explode().str.strip()])
top_skills = combined.value_counts().head(top_n).index.tolist()
return top_skills
# Top skills from job postings
job_required_skills = extract_top_skills(df, ['SPECIALIZED_SKILLS_NAME', 'SOFTWARE_SKILLS_NAME'], top_n=10)
print("Top Job Skills in Demand:\n")
for i, skill in enumerate(job_required_skills, 1):
print(f"{i}. {skill.title()}")Top Job Skills in Demand:
1. "Sql (Programming Language)"
2. "Data Analysis"
3. "Python (Programming Language)"
4. "Dashboard"
5. "Sap Applications"
6. "Project Management"
7. [
"Power Bi"
8. "Business Process"
9. "Business Requirements"
10. "Finance"
import re
def clean_skill_name(skill):
# Remove brackets, quotes, extra spaces
skill = re.sub(r'[\[\]\'"]', '', skill)
return skill.strip()
def extract_top_skills(df, columns, top_n=10):
combined = pd.Series(dtype="str")
for col in columns:
if col in df.columns:
combined = pd.concat([combined, df[col].dropna().str.split(',').explode().map(clean_skill_name)])
top_counts = combined.value_counts().head(top_n)
return top_counts.reset_index().rename(columns={'index': 'Skill', 0: 'Count'})
top_skills_df = extract_top_skills(df, ['SPECIALIZED_SKILLS_NAME', 'SOFTWARE_SKILLS_NAME'])
display(top_skills_df)
# Use clean, normalized version for matching with team skills
job_required_skills = top_skills_df['Skill'].str.lower().str.replace(' ', '_').tolist()
matched_skills = [skill for skill in job_required_skills if skill in df_skills.columns]
missing_skills = [skill for skill in job_required_skills if skill not in df_skills.columns]
if matched_skills:
plt.figure(figsize=(10, 6))
sns.heatmap(df_skills[matched_skills], annot=True, cmap="YlGnBu", linewidths=0.5)
plt.title("Team Skill Levels for In-Demand Job Skills")
plt.savefig("iframe_figures/team_skills_heatmap.png", bbox_inches='tight', dpi=300)
plt.show()
else:
print("No matched skills to visualize.")| Skill | count | |
|---|---|---|
| 0 | SQL (Programming Language) | 43478 |
| 1 | Data Analysis | 28522 |
| 2 | Python (Programming Language) | 24510 |
| 3 | SAP Applications | 24256 |
| 4 | Dashboard | 23724 |
| 5 | Tableau (Business Intelligence Software) | 23568 |
| 6 | Power BI | 21702 |
| 7 | Computer Science | 17293 |
| 8 | Project Management | 13715 |
| 9 | Business Process | 13286 |